########################################################
## This code carries out parameter tuning, model estimation and prediction
## for extended and restricted models
########################################################

########################################################
## Preparation of the workspace
########################################################

## start from an empty workspace: drop every object that ls() lists
rm(list = ls())

## attach the required packages
library(haven)
library(caret)
library(randomForest)
library(doParallel)
library(mice)
library(plyr)
library(dplyr)
library(VIM)
library(base)
library(ranger)
library(glmnet)
library(xgboost)

## remember when the run started -> allows checking how long the code takes
start_time <- Sys.time()

## folders used below: data files and R programs, both under the main directory
main <- "placeholder_main"

dataDirectory <- paste0(main, "/Data")
RDirectory <- paste0(main, "/Programs/Output Generation")

## read in the two files with the tuning and the training functions
## (source() evaluates them in the global environment by default)
invisible(lapply(
  c("/102_0_caret_parameter_tuning_Function.R",
    "/103_0_caret_predictions_Function.R"),
  function(script) source(paste0(RDirectory, script))
))



########################################################
## Alternative unemployment horizons in 2006
########################################################

# Define the model that is used (the name of the model is a part of the name of the .dta file with data)
#models <- c("Full")
model_list <- "Full"

# Define the outcome variables that are predicted
dependent_variable <- c("emplAft3M_0M_In", "emplAft12M_0M_In")

# Define the time period:
years <- 2006


## Running the loop:
## for every model / year / outcome, call tuning(), store the grids it returns,
## call estimating() and store the fitted models and the predictions.
for (model in model_list) {

  for (y in years) {

    # The input file depends only on model and year, so it is read once here
    # instead of once per outcome.
    data <- read_dta(paste0(dataDirectory, "/002_DataForR_", model, "_", y, ".dta"))

    # Start every model/year without tuned parameters, so the no-tuning branch
    # below can never silently reuse parameters left over from another model,
    # year or section of this script.
    parameters <- NULL

    for (dependent in dependent_variable) {

      # Common part of every output file name for this model/outcome/year
      file_tag <- paste0(model, "_", dependent, "_", y)

      if (dependent %in% c("emplAft6M_0M_In", "emplAft3M_0M_In", "emplAft12M_0M_In")) {

        # Run the tuning function:
        parameters <- tuning(data = data, dependent = dependent,
                             s_tuning = 0.1, seed = 2111, noisily = TRUE)

        # Store the final grids and the searched grids returned by tuning()
        write.csv(parameters$rfgrid_final,
                  file = paste0(dataDirectory, "/102_rfgrid_", file_tag, ".csv"),
                  row.names = FALSE)
        write.csv(parameters$boostgrid_final,
                  file = paste0(dataDirectory, "/102_boostgrid_", file_tag, ".csv"),
                  row.names = FALSE)
        write.csv(parameters$lassogrid_final,
                  file = paste0(dataDirectory, "/102_lassogrid_", file_tag, ".csv"),
                  row.names = FALSE)
        write.csv(parameters$rfgrid_search,
                  file = paste0(dataDirectory, "/102_rfgrid_search_", file_tag, ".csv"),
                  row.names = FALSE)
        write.csv(parameters$boostgrid_search,
                  file = paste0(dataDirectory, "/102_boostgrid_search_", file_tag, ".csv"),
                  row.names = FALSE)
        write.csv(parameters$lassogrid_search,
                  file = paste0(dataDirectory, "/102_lassogrid_search_", file_tag, ".csv"),
                  row.names = FALSE)

        # Run the prediction function
        # (s_tuning / s_training are presumably sample shares - confirm in estimating())
        results <- estimating(data = data, dependent = dependent,
                              parameters = parameters,
                              s_tuning = 0.1, s_training = 0.3, seed = 2111, noisily = TRUE)

      } else {

        # This branch reuses the parameters tuned earlier for the same model/year;
        # fail clearly if there are none instead of depending on leftover objects.
        if (is.null(parameters)) {
          stop("No tuned parameters available for outcome '", dependent,
               "' (model '", model, "', year ", y, "): ",
               "an outcome that is tuned must come first in dependent_variable.",
               call. = FALSE)
        }

        results <- estimating(data = data, dependent = dependent,
                              parameters = parameters,
                              s_tuning = 0, s_training = 0.4, seed = 2111, noisily = TRUE)
      }

      # Save models (the object must be called `models`: it is restored under
      # that name by load())
      models <- results$models
      save(models,
           file = paste0(dataDirectory, "/103_Models_", file_tag, ".rda"))

      # Save predictions
      write.csv(results$output,
                file = paste0(dataDirectory, "/103_predictionsR_", file_tag, ".csv"))

    }
  }
}

########################################################
## Other models (only at 0M)
########################################################

# Define the model that is used (the name of the model is a part of the name of the .dta file with data)
model_list <- c(
            # Marginal sub_models:
            "Full_Marg_incIndiv","Full_Marg_emplHist", "Full_Marg_incOther",
            "Full_Marg_incHist","Full_Marg_migHist","Full_Marg_indu",
            "Full_Marg_mun",
            # Sequential sub-models:
            "Full_SeqDrop_mun", "Full_SeqDrop_indu", "Full_SeqDrop_incHist",
            "Full_SeqDrop_emplHist", "Full_SeqDrop_incOther",
            "Full_SeqDrop_migHist",
            # Models dropping past spell information:
            "Full_DropPastSpells_DaysUnemp", "Full_DropPastSpells_unemplSpells", "Full_DropPastSpells_Both",
            # Extended models (Full sample):
            "EX_FullSample_UI", "EX_FullSample_WE", "EX_FullSample_OC",
            "EX_FullSample_RR", "EX_FullSample_IQ", "EX_FullSample_UM",
            "EX_FullSample_ALL",
            # Extended models (Basic + other data):
            "EX_Ba_FullSample_UI", "EX_Ba_FullSample_WE", "EX_Ba_FullSample_OC",
            "EX_Ba_FullSample_RR", "EX_Ba_FullSample_IQ", "EX_Ba_FullSample_UM",
            "EX_Ba_FullSample_ALL", "EX_FullSample_Baseline", "EX_Ba_FullSample_Baseline")


# Define the outcome variables that are predicted
dependent_variable <- c("emplAft6M_0M_In")

# Define the time period:
years <- 2006


## Running the loop:
## for every model / year / outcome, call tuning(), store the grids it returns,
## call estimating() and store the fitted models and the predictions.
for (model in model_list) {

  for (y in years) {

    # The input file depends only on model and year, so it is read once here.
    # Loading it outside the outcome loop also guarantees that every outcome is
    # estimated on the data of the current model, never on a previous model's.
    data <- read_dta(paste0(dataDirectory, "/002_DataForR_", model, "_", y, ".dta"))

    # Start every model/year without tuned parameters, so the no-tuning branch
    # below can never silently reuse parameters tuned for another model.
    parameters <- NULL

    for (dependent in dependent_variable) {

      # Common part of every output file name for this model/outcome/year
      file_tag <- paste0(model, "_", dependent, "_", y)

      if (dependent == "emplAft6M_0M_In") {

        # Run the tuning function:
        parameters <- tuning(data = data, dependent = dependent,
                             s_tuning = 0.1, seed = 2111, noisily = TRUE)

        # Store the final grids and the searched grids returned by tuning()
        write.csv(parameters$rfgrid_final,
                  file = paste0(dataDirectory, "/102_rfgrid_", file_tag, ".csv"),
                  row.names = FALSE)
        write.csv(parameters$boostgrid_final,
                  file = paste0(dataDirectory, "/102_boostgrid_", file_tag, ".csv"),
                  row.names = FALSE)
        write.csv(parameters$lassogrid_final,
                  file = paste0(dataDirectory, "/102_lassogrid_", file_tag, ".csv"),
                  row.names = FALSE)
        write.csv(parameters$rfgrid_search,
                  file = paste0(dataDirectory, "/102_rfgrid_search_", file_tag, ".csv"),
                  row.names = FALSE)
        write.csv(parameters$boostgrid_search,
                  file = paste0(dataDirectory, "/102_boostgrid_search_", file_tag, ".csv"),
                  row.names = FALSE)
        write.csv(parameters$lassogrid_search,
                  file = paste0(dataDirectory, "/102_lassogrid_search_", file_tag, ".csv"),
                  row.names = FALSE)

        # Run the prediction function
        # (s_tuning / s_training are presumably sample shares - confirm in estimating())
        results <- estimating(data = data, dependent = dependent,
                              parameters = parameters,
                              s_tuning = 0.1, s_training = 0.3, seed = 2111, noisily = TRUE)

      } else {

        # This branch reuses the parameters tuned earlier for the same model/year;
        # fail clearly if there are none instead of depending on leftover objects.
        if (is.null(parameters)) {
          stop("No tuned parameters available for outcome '", dependent,
               "' (model '", model, "', year ", y, "): ",
               "an outcome that is tuned must come first in dependent_variable.",
               call. = FALSE)
        }

        results <- estimating(data = data, dependent = dependent,
                              parameters = parameters,
                              s_tuning = 0, s_training = 0.4, seed = 2111, noisily = TRUE)
      }

      # Save models (the object must be called `models`: it is restored under
      # that name by load())
      models <- results$models
      save(models,
           file = paste0(dataDirectory, "/103_Models_", file_tag, ".rda"))

      # Save predictions
      write.csv(results$output,
                file = paste0(dataDirectory, "/103_predictionsR_", file_tag, ".csv"))

    }
  }
}
    

########################################################
## Basic model at all durations
########################################################

# Define the model that is used (the name of the model is a part of the name of the .dta file with data)
#models <- c("Full")
model_list <- "Full_SeqDrop_incIndiv"

# Define the outcome variables that are predicted
# (the outcome that is tuned, emplAft6M_0M_In, has to come first: the other
#  outcomes are estimated with the parameters tuned for it)
dependent_variable <- c("emplAft6M_0M_In", "emplAft6M_6M_In", "emplAft6M_12M_In")

# Define the time period:
years <- 2006


## Running the loop:
## parameters are tuned on emplAft6M_0M_In only; the remaining outcomes are
## estimated with those parameters and without a tuning sample (s_tuning = 0).
for (model in model_list) {

  for (y in years) {

    # The input file depends only on model and year, so it is read once here;
    # all outcomes of this model/year are estimated on this dataset.
    data <- read_dta(paste0(dataDirectory, "/002_DataForR_", model, "_", y, ".dta"))

    # Start every model/year without tuned parameters, so the no-tuning branch
    # below can never silently reuse parameters left over from another model,
    # year or section of this script.
    parameters <- NULL

    for (dependent in dependent_variable) {

      # Common part of every output file name for this model/outcome/year
      file_tag <- paste0(model, "_", dependent, "_", y)

      if (dependent == "emplAft6M_0M_In") {

        # Run the tuning function:
        parameters <- tuning(data = data, dependent = dependent,
                             s_tuning = 0.1, seed = 2111, noisily = TRUE)

        # Store the final grids and the searched grids returned by tuning()
        write.csv(parameters$rfgrid_final,
                  file = paste0(dataDirectory, "/102_rfgrid_", file_tag, ".csv"),
                  row.names = FALSE)
        write.csv(parameters$boostgrid_final,
                  file = paste0(dataDirectory, "/102_boostgrid_", file_tag, ".csv"),
                  row.names = FALSE)
        write.csv(parameters$lassogrid_final,
                  file = paste0(dataDirectory, "/102_lassogrid_", file_tag, ".csv"),
                  row.names = FALSE)
        write.csv(parameters$rfgrid_search,
                  file = paste0(dataDirectory, "/102_rfgrid_search_", file_tag, ".csv"),
                  row.names = FALSE)
        write.csv(parameters$boostgrid_search,
                  file = paste0(dataDirectory, "/102_boostgrid_search_", file_tag, ".csv"),
                  row.names = FALSE)
        write.csv(parameters$lassogrid_search,
                  file = paste0(dataDirectory, "/102_lassogrid_search_", file_tag, ".csv"),
                  row.names = FALSE)

        # Run the prediction function
        # (s_tuning / s_training are presumably sample shares - confirm in estimating())
        results <- estimating(data = data, dependent = dependent,
                              parameters = parameters,
                              s_tuning = 0.1, s_training = 0.3, seed = 2111, noisily = TRUE)

      } else {

        # This branch reuses the parameters tuned for emplAft6M_0M_In;
        # fail clearly if there are none instead of depending on leftover objects.
        if (is.null(parameters)) {
          stop("No tuned parameters available for outcome '", dependent,
               "' (model '", model, "', year ", y, "): ",
               "an outcome that is tuned must come first in dependent_variable.",
               call. = FALSE)
        }

        results <- estimating(data = data, dependent = dependent,
                              parameters = parameters,
                              s_tuning = 0, s_training = 0.4, seed = 2111, noisily = TRUE)
      }

      # Save models (the object must be called `models`: the cross-model
      # prediction step restores it under that name with load())
      models <- results$models
      save(models,
           file = paste0(dataDirectory, "/103_Models_", file_tag, ".rda"))

      # Save predictions
      write.csv(results$output,
                file = paste0(dataDirectory, "/103_predictionsR_", file_tag, ".csv"))

    }
  }
}

## Creating predictions for people unemployed in X month using Y month model
##
## For every outcome in dependent_variable, the hold-out observations of that
## outcome are scored with the models saved for the *other* emplAft6M_<k>M_In
## outcomes (k = 0, 6, 12). One CSV is written per outcome / model pair.
for (model in model_list) {

  for (y in years) {

    for (dependent in dependent_variable) {

      print(paste(dependent, "Start")) ## progress message: this outcome starts

      ########################################################
      ## Load the dataset
      ########################################################
      data <- read_dta(paste0(dataDirectory, "/002_DataForR_", model, "_", y, ".dta"))

      ########################################################
      ## Define the sub-sample for creation of predictions
      ########################################################

      ## Position (in n_order) of the first hold-out observation: everything after
      ## the tuning part (10% of the rows + 1) and the training part (30% of the rows).
      ## NOTE(review): presumably mirrors the split made inside tuning()/estimating()
      ## - confirm against those functions.
      n_parametertuning <- round(nrow(data) * 0.1, digits = 0) + 1
      n_training <- round(nrow(data) * 0.3, digits = 0)
      first_holdout <- n_training + n_parametertuning + 1

      ## Column where the covariates start, and column of the outcome of interest
      first_column <- which(colnames(data) == "Gender")
      y_column <- which(colnames(data) == dependent)
      if (length(first_column) != 1L || length(y_column) != 1L) {
        stop("Columns 'Gender' and '", dependent, "' must each occur exactly once ",
             "in the data of model '", model, "', year ", y, ".", call. = FALSE)
      }

      ## Keeping only those observations with non-missing outcome variable
      data <- data[complete.cases(data[, y_column]), ]

      ## Data for predictions (hold-out sample)
      data_pred <- data[data$n_order >= first_holdout, ]

      total_y <- factor(data_pred[[dependent]]) ## outcome as a factor/category
      total_x <- data_pred[, first_column:ncol(data_pred)] ## all covariates, from "Gender" to the last column

      ## The outcome has to be binary. (Previously this was enforced only as a side
      ## effect of re-labelling a temporary copy of the outcome that was never used.)
      ## NOTE(review): the outcome written to the CSV keeps its original labels, as
      ## before; confirm whether "no"/"yes" labels were intended in the output.
      if (nlevels(total_y) > 2L) {
        stop("Outcome '", dependent, "' has more than two levels.", call. = FALSE)
      }

      ## Individual ID and n of the hold-out observations, stored next to the predictions
      persinfo <- as.data.frame(cbind(data_pred$LopNr_PersonNr, data_pred$n))

      ## Removes the initial dataset that was loaded and the intermediate dataset
      rm("data")
      rm("data_pred")

      ## Matrix version of the covariates, used for gradient boost and LASSO.
      ## It does not depend on the model that is applied, so it is built once here.
      total_x_matrix <- data.matrix(total_x)

      for (unempl in c(0, 6, 12)) {

        ## Outcome on which the model to be applied was trained
        model_outcome <- paste0("emplAft6M_", unempl, "M_In")

        ## Own-model predictions are not produced here: skip that combination
        if (dependent == model_outcome) {
          next
        }

        ########################################################
        ## Load models
        ########################################################

        ## Restores the object `models` saved for model_outcome
        load(file = paste0(dataDirectory, "/103_Models_", model, "_", model_outcome, "_", y, ".rda"))

        ########################################################
        ## Create predictions
        ########################################################

        ## Creating predictions Random Forest (second column of the predictions).
        ## The expression prob_rf[,2] is kept as is: data.frame() derives the
        ## column name in the CSV from it.
        prob_rf <- predict(models$rff_final, total_x)
        prob_rf <- prob_rf[["predictions"]]
        prob_rf <- data.frame(prob_rf[,2])

        ## Creating predictions Gradient Boost
        prob_boost <- predict(models$rboost_final, total_x_matrix)

        ## Creating predictions LASSO
        prob_lasso <- predict(models$rlasso_final, total_x_matrix, type = "response")

        ########################################################
        ## Save predictions
        ########################################################

        ## All predictions together with the outcome variable, personal ID and n
        output <- cbind(total_y, prob_rf, prob_boost, prob_lasso, persinfo)
        write.csv(output, file = paste0(dataDirectory, "/103_predictionsR_", model, "_", dependent, "_", y, "_", model_outcome, "_Model.csv"))
        rm("output") ## remove the dataset with output

      }

      rm("total_x_matrix") ## remove the matrix version of the covariates

      print(paste(dependent, "End")) ## progress message: this outcome is done (once per outcome)

    }

  }
}


